Curious what Congress and Trump are saying about the coronavirus?

This analysis looks at the number of tweets, their positive and negative sentiment, and their content for each party, and how these have evolved over time since February 1, 2020.

I utilize open data hosted online. In particular, big thanks to Alex Litel who created the Tweets of Congress repo, where I pulled congressional tweets from, and the folks running Trump Twitter Archive, where I pulled Trump’s tweets from.

The repo for this project is: https://github.com/dcosme/congress-tweets-covid19

prep data

load packages

define palettes

load congress twitter handles

# Build a lookup table of congressional Twitter handles keyed by first initial
# and last name, with one column per handle source (ODU.WSDL, CSPAN,
# TweetCongress, Github).
congress_twitter = read.csv("~/Documents/code/US-Congress/116thCongress/116Congress.csv", stringsAsFactors = FALSE) %>%
  rename("name" = Wikipedia..Names) %>%
  # reshape the four handle-source columns to long form (one row per
  # member x handle source)
  gather(handle_type, twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github) %>%
  select(name, handle_type, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         # treat empty handle strings as missing
         twitter_handle = ifelse(twitter_handle == "", NA, twitter_handle),
         # the source file stores some accented characters as literal escape
         # sequences like "<e9>"; mark the string as latin1 and substitute the
         # intended accented character for each sequence
         name = gsub("<e9>", "é", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<e1>", "á", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<fa>", "ú", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<ed>", "í", `Encoding<-`(name, "latin1"), fixed = TRUE),
         # then strip accents down to plain ASCII so names can be matched
         # against the other (unaccented) congress data sources
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # "first" captures only the first initial (a single [A-Za-z] character);
  # "last" is the final whitespace-separated token of the name
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE) %>%
  # back to wide form: one column of handles per handle source
  spread(handle_type, twitter_handle)

# Combine senate and house member rosters, then normalize Twitter handles and
# names so this table can be matched against the handle lookup table by first
# initial + last name.
congress = read.csv("~/Documents/code/us-senate/us-senate/data/us-senate.csv", stringsAsFactors = FALSE) %>%
  bind_rows(read.csv("~/Documents/code/us-house/us-house/data/us-house.csv", stringsAsFactors = FALSE)) %>%
  select(state_name, title, party, name, gender, ethnicity, twitter_handle) %>%
  # clean handles: lowercase, and treat empty or known-bad values as missing
  mutate(twitter_handle = tolower(twitter_handle),
         twitter_handle = ifelse(twitter_handle %in% c("", "housedemocrats", "senatorloeffler?lang=en"),
                                 NA, twitter_handle)) %>%
  # strip accents from names so they match the unaccented handle table
  mutate(name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # "first" is the first initial only; "last" is the final name token
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE)

# Merge member demographics with the handle lookup table and collapse all
# handle sources into a single twitter_handle column, one row per handle.
congress_info = full_join(congress, congress_twitter, by = c("first", "last")) %>%
  # stack every handle-source column (including the roster's own
  # twitter_handle column) into one key/value pair; note the value column
  # deliberately reuses the name twitter_handle
  gather(handle_type, twitter_handle, twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github) %>%
  select(state_name, title, party, first, last, gender, ethnicity, twitter_handle) %>%
  group_by(first, last) %>%
  # propagate known values in both directions within each member so rows from
  # either source carry complete demographics and handles
  fill(state_name, title, party, gender, ethnicity, twitter_handle, .direction = "updown") %>%
  unique() %>%
  # drop members that never matched the roster (no state information)
  filter(!is.na(state_name)) %>%
  ungroup() %>%
  mutate(last = tolower(last))

load congressional tweets

pull to update repo

## From https://github.com/alexlitel/congresstweets
##  * branch            master     -> FETCH_HEAD
##    b357d07..cc55d59  master     -> origin/master
## Updating b357d07..cc55d59
## Fast-forward
##  _posts/2020-03-15--tweets.md | 8 ++++++++
##  data/2020-03-15.json         | 1 +
##  2 files changed, 9 insertions(+)
##  create mode 100644 _posts/2020-03-15--tweets.md
##  create mode 100644 data/2020-03-15.json

define keywords and words to ignore

load the files

find missing congressional twitter handles

load trump tweets

merge tweets

define wordcloud function

# Plot the number of tweets over time, colored by party.
#
# Args:
#   data:       data frame with at least `party`, `day`, and the column named
#               by `timescale`
#   timescale:  name of the time column to plot on the x axis (e.g. "week")
#   start_date: optional lower bound on `day`; NULL keeps all rows
#   palette:    fill/color values passed to the manual scales
#
# Returns: a list with `plot1` (bar chart) and `plot2` (line chart).
plot_number = function(data, timescale = "week", start_date = NULL, palette = palette4) {

  # optionally restrict to tweets on or after start_date
  if (!is.null(start_date)) {
    data = data %>%
      filter(day >= start_date)
  }

  # both plots share the same data: rows with a known party
  tweets = data %>%
    filter(!is.na(party))

  bar_plot = tweets %>%
    ggplot(aes(!!sym(timescale), fill = party)) +
    geom_bar(stat = "count") +
    scale_fill_manual(name = "", values = palette) +
    labs(x = "", y = "number of tweets\n") +
    theme_minimal(base_size = 14) +
    theme(legend.position = "top")

  line_plot = tweets %>%
    ggplot(aes(!!sym(timescale), color = party)) +
    geom_line(stat = "count") +
    scale_color_manual(name = "", values = palette) +
    labs(x = "", y = "number of tweets\n") +
    theme_minimal(base_size = 14) +
    theme(legend.position = "top")

  return(list(plot1 = bar_plot, plot2 = line_plot))

}

# Plot the most frequent positive/negative sentiment words per party.
#
# Args:
#   data:       data frame with `party`, `day`, and raw tweet `text`
#   n_words:    number of top words to keep per party (ties included)
#   start_date: optional lower bound on `day`; NULL keeps all rows
#   duration:   optional number of days after start_date to include
#   palette:    fill values for the positive/negative sentiment scale
#
# Returns: a faceted ggplot bar chart, one panel per party.
plot_sentiment = function(data, n_words = 20, 
                          start_date = NULL, duration = NULL, 
                          palette = palette2) {

  # optional date window: [start_date, start_date + duration] or open-ended
  if (!is.null(start_date)) {
    if (!is.null(duration)) {
      data = data %>%
        filter(day >= start_date & day <= lubridate::date(start_date) + lubridate::days(duration))
    } else {
      data = data %>%
        filter(day >= start_date)
    }
  }
  
  sentiments = data %>%
    unnest_tokens(word, text) %>%
    # label words with bing positive/negative sentiment; join key is explicit
    # to avoid the "Joining, by = ..." runtime message
    inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
    anti_join(stop_words, by = "word") %>%
    filter(!grepl(ignore_root_words, word)) %>%
    filter(!word %in% ignore_words) %>%
    filter(!word == "trump") %>%
    group_by(party) %>%
    # count() keeps the party grouping, so top_n() operates within each party
    count(word, sentiment, sort = TRUE) %>%
    filter(party %in% c("democrat", "republican", "trump")) %>%
    top_n(n_words) %>%
    arrange(n)
  
  sentiments %>%
    # reorder_within orders words by count independently within each facet
    ggplot(aes(drlib::reorder_within(word, n, party), n, fill = sentiment)) +
    geom_col() +
    drlib::scale_x_reordered() +
    facet_wrap(~party, scales = "free") +
    labs(y = "\nnumber of times tweeted",
         x = NULL) +
    coord_flip() +
    scale_fill_manual(name = "", values = palette) +
    theme_minimal(base_size = 14) +
    theme(legend.position = "top")
  
}

# Word cloud of the most frequently tweeted words, faceted by party.
#
# Args:
#   data:       data frame with `party`, `day`, and raw tweet `text`
#   party:      optional single party to keep; NULL keeps all parties
#   start_date: optional lower bound on `day`; NULL keeps all rows
#   duration:   optional number of days after start_date to include
#   n_words:    number of top words per party to display
#   n_colors:   number of color bins (ntile) for word frequency
#   size:       maximum word size in the cloud
#
# Returns: a list with `plot` (the word cloud) and `data` (the filtered input).
plot_content = function(data, party=NULL, start_date=NULL, duration=NULL, n_words=50, n_colors=6, size=20) {

  data = data %>%
    mutate(day = lubridate::as_date(day))

  # optionally restrict to a single party
  if (!is.null(party)) {
    data = data %>%
      filter(party == !!party)
  }

  # optional date window: [start_date, start_date + duration] or open-ended
  if (!is.null(start_date)) {
    if (!is.null(duration)) {
      data = data %>%
        filter(day >= start_date & day <= lubridate::date(start_date) + lubridate::days(duration))
    } else {
      data = data %>%
        filter(day >= start_date)
    }
  }

  palette = wesanderson::wes_palette("Zissou1", n_colors, "continuous")

  # fixed seed so word placement in the cloud is reproducible across runs
  set.seed(42)

  word_counts = data %>%
    filter(party %in% c("democrat", "republican", "trump")) %>%
    select(text, party) %>%
    unnest_tokens(word, text) %>%
    group_by(party) %>%
    count(word, sort = TRUE) %>%
    # drop stop words and project-defined ignore lists, then keep the top
    # n_words rows per party (counts are already sorted descending)
    anti_join(stop_words, by = "word") %>%
    filter(!grepl(ignore_root_words, word),
           !word %in% ignore_words) %>%
    slice(1:n_words) %>%
    mutate(word = gsub("[[:punct:]]", "", word),
           sum = sum(n),
           size = n / sum,
           tile = ntile(n, n_colors))

  plot = word_counts %>%
    ggplot(aes(label = word, size = size, color = as.factor(tile))) +
    geom_text_wordcloud_area(shape = "square", rm_outside = TRUE) +
    scale_size_area(max_size = size, trans = power_trans(1/.7)) +
    scale_color_manual(values = palette) +
    facet_wrap(~party) +
    theme_minimal() +
    theme(strip.text.x = element_text(size = 12))

  return(list(plot = plot, data = data))
}

number of tweets

How many times have Congress and the President tweeted about COVID-19?

daily

## $plot1

## 
## $plot2

weekly

## $plot1

## 
## $plot2

content of tweets

What are Congress and the President saying about COVID-19?

Here are the 100 most frequently used words by each party and the President.

by week

Here are the 50 most frequently used words by each party and the President for each week in February and March.

browse tweets

Who’s tweeting the most and what are they tweeting?

number of tweets

word counts

tweets